import os
NOVA_HOME = '/home/projects/hornsteinlab/Collaboration/NOVA'
NOVA_DATA_HOME = os.path.join(NOVA_HOME, 'input')
LOGS_PATH = os.path.join(NOVA_HOME, "outputs/preprocessing/ManuscriptFinalData_80pct/neuronsDay18/logs/")
PLOT_PATH = os.path.join(NOVA_HOME, "outputs/preprocessing/ManuscriptFinalData_80pct/neuronsDay18/logs/plots")
os.chdir(NOVA_HOME)
import pandas as pd
import contextlib
import io
from IPython.display import display, Javascript
from tools.preprocessing_tools.qc_reports.qc_utils import log_files_qc, run_validate_folder_structure, display_diff, sample_and_calc_variance, \
show_site_survival_dapi_brenner, show_site_survival_dapi_cellpose, \
show_site_survival_dapi_tiling, show_site_survival_target_brenner, \
calc_total_sums, plot_filtering_heatmap, show_total_sum_tables, \
plot_cell_count, plot_catplot, plot_hm_of_mean_cell_count_per_tile, \
run_calc_hist_new
from tools.preprocessing_tools.qc_reports.qc_config import opera18days_panels, opera18days_markers, opera18days_marker_info, \
opera18days_cell_lines, opera18days_cell_lines_to_cond,\
opera18days_cell_lines_for_disp, opera18days_reps, \
opera18days_line_colors, opera18days_lines_order, \
opera18days_custom_palette, opera18days_expected_dapi_raw, \
markers
%load_ext autoreload
%autoreload 2
The autoreload extension is already loaded. To reload it, use: %reload_ext autoreload
df = log_files_qc(LOGS_PATH,only_wt_cond=False, filename_split='-',site_location=0)
df_dapi = df[df.marker=='DAPI']
df_target = df[df.marker!='DAPI']
reading logs of batch2 reading logs of batch1 Total of 4 files were read. Before dup handeling (131158, 21) After duplication removal #1: (120135, 22) After duplication removal #2: (120135, 22)
# choose batches
batches = ['batch1', 'batch2']
batches
['batch1', 'batch2']
root_directory_raw = os.path.join(NOVA_DATA_HOME, 'images', 'raw', 'Opera18DaysReimaged_sorted')
batches_raw = [batch.replace("_16bit_no_downsample","") for batch in batches]
raws = run_validate_folder_structure(root_directory_raw, False, opera18days_panels, opera18days_markers.copy(),PLOT_PATH, opera18days_marker_info,
opera18days_cell_lines_to_cond, opera18days_reps, opera18days_cell_lines_for_disp, opera18days_expected_dapi_raw,
batches=batches_raw, fig_height=12)
batch1 Folder structure is valid. No bad files are found. Total Sites: 82000
======== batch2 Folder structure is invalid. Missing 2 paths: /home/projects/hornsteinlab/Collaboration/NOVA/input/images/raw/Opera18DaysReimaged_sorted/batch2/FUSHomozygous/panelD/stress/rep1/CLTC /home/projects/hornsteinlab/Collaboration/NOVA/input/images/raw/Opera18DaysReimaged_sorted/batch2/FUSHomozygous/panelD/stress/rep2/CLTC No bad files are found. Total Sites: 81800
======== ====================
root_directory_proc = os.path.join(NOVA_DATA_HOME, 'images', 'processed', 'ManuscriptFinalData_80pct', 'neuronsDay18')
procs = run_validate_folder_structure(root_directory_proc, True, opera18days_panels, opera18days_markers,PLOT_PATH,opera18days_marker_info,
opera18days_cell_lines_to_cond, opera18days_reps, opera18days_cell_lines_for_disp, opera18days_expected_dapi_raw,
batches=batches, fig_height=12)
batch1 Folder structure is invalid. Missing 10 paths: /home/projects/hornsteinlab/Collaboration/NOVA/input/images/processed/ManuscriptFinalData_80pct/neuronsDay18/batch1/WT/Untreated/CD41 /home/projects/hornsteinlab/Collaboration/NOVA/input/images/processed/ManuscriptFinalData_80pct/neuronsDay18/batch1/WT/stress/CD41 /home/projects/hornsteinlab/Collaboration/NOVA/input/images/processed/ManuscriptFinalData_80pct/neuronsDay18/batch1/FUSHomozygous/Untreated/CD41 /home/projects/hornsteinlab/Collaboration/NOVA/input/images/processed/ManuscriptFinalData_80pct/neuronsDay18/batch1/FUSHomozygous/stress/CD41 /home/projects/hornsteinlab/Collaboration/NOVA/input/images/processed/ManuscriptFinalData_80pct/neuronsDay18/batch1/FUSHeterozygous/Untreated/CD41 /home/projects/hornsteinlab/Collaboration/NOVA/input/images/processed/ManuscriptFinalData_80pct/neuronsDay18/batch1/TBK1/Untreated/CD41 /home/projects/hornsteinlab/Collaboration/NOVA/input/images/processed/ManuscriptFinalData_80pct/neuronsDay18/batch1/TDP43/Untreated/CD41 /home/projects/hornsteinlab/Collaboration/NOVA/input/images/processed/ManuscriptFinalData_80pct/neuronsDay18/batch1/FUSRevertant/Untreated/CD41 /home/projects/hornsteinlab/Collaboration/NOVA/input/images/processed/ManuscriptFinalData_80pct/neuronsDay18/batch1/OPTN/Untreated/CD41 /home/projects/hornsteinlab/Collaboration/NOVA/input/images/processed/ManuscriptFinalData_80pct/neuronsDay18/batch1/SNCA/Untreated/CD41 No bad files are found. Total Sites: 54596
======== batch2 Folder structure is invalid. Missing 15 paths: /home/projects/hornsteinlab/Collaboration/NOVA/input/images/processed/ManuscriptFinalData_80pct/neuronsDay18/batch2/WT/Untreated/CD41 /home/projects/hornsteinlab/Collaboration/NOVA/input/images/processed/ManuscriptFinalData_80pct/neuronsDay18/batch2/WT/stress/CD41 /home/projects/hornsteinlab/Collaboration/NOVA/input/images/processed/ManuscriptFinalData_80pct/neuronsDay18/batch2/FUSHomozygous/Untreated/CD41 /home/projects/hornsteinlab/Collaboration/NOVA/input/images/processed/ManuscriptFinalData_80pct/neuronsDay18/batch2/FUSHomozygous/stress/CLTC /home/projects/hornsteinlab/Collaboration/NOVA/input/images/processed/ManuscriptFinalData_80pct/neuronsDay18/batch2/FUSHomozygous/stress/CD41 /home/projects/hornsteinlab/Collaboration/NOVA/input/images/processed/ManuscriptFinalData_80pct/neuronsDay18/batch2/FUSHeterozygous/Untreated/CD41 /home/projects/hornsteinlab/Collaboration/NOVA/input/images/processed/ManuscriptFinalData_80pct/neuronsDay18/batch2/TBK1/Untreated/CD41 /home/projects/hornsteinlab/Collaboration/NOVA/input/images/processed/ManuscriptFinalData_80pct/neuronsDay18/batch2/TDP43/Untreated/CD41 /home/projects/hornsteinlab/Collaboration/NOVA/input/images/processed/ManuscriptFinalData_80pct/neuronsDay18/batch2/FUSRevertant/Untreated/SQSTM1 /home/projects/hornsteinlab/Collaboration/NOVA/input/images/processed/ManuscriptFinalData_80pct/neuronsDay18/batch2/FUSRevertant/Untreated/FMRP /home/projects/hornsteinlab/Collaboration/NOVA/input/images/processed/ManuscriptFinalData_80pct/neuronsDay18/batch2/FUSRevertant/Untreated/CD41 /home/projects/hornsteinlab/Collaboration/NOVA/input/images/processed/ManuscriptFinalData_80pct/neuronsDay18/batch2/FUSRevertant/Untreated/Phalloidin /home/projects/hornsteinlab/Collaboration/NOVA/input/images/processed/ManuscriptFinalData_80pct/neuronsDay18/batch2/OPTN/Untreated/CD41 /home/projects/hornsteinlab/Collaboration/NOVA/input/images/processed/ManuscriptFinalData_80pct/neuronsDay18/batch2/SNCA/Untreated/CD41 /home/projects/hornsteinlab/Collaboration/NOVA/input/images/processed/ManuscriptFinalData_80pct/neuronsDay18/batch2/SNCA/Untreated/Tubulin No bad files are found. Total Sites: 52681
======== ====================
display_diff(batches, raws, procs, PLOT_PATH,fig_height=12)
batch1
======== batch2
========
#for batch in list(range(3,9)) + ['7_16bit','8_16bit','9_16bit']:
for batch in batches:
with contextlib.redirect_stdout(io.StringIO()):
var = sample_and_calc_variance(root_directory_proc, batch,
sample_size_per_markers=200, cond_count=2, rep_count=len(opera18days_reps),
num_markers=len(opera18days_markers))
print(f'{batch} var: ',var)
batch1 var: 0.041773310225922025 batch2 var: 0.040858276841131386
By order of filtering
Percentage out of the total sites
dapi_filter_by_brenner = show_site_survival_dapi_brenner(df_dapi,batches, opera18days_line_colors, opera18days_panels,
figsize=(10,6), reps = opera18days_reps)
Percentage out of the sites that passed the previous filter. In parenthesis are absolute values.
A site will be filtered out if Cellpose found 0 cells in it.
dapi_filter_by_cellpose = show_site_survival_dapi_cellpose(df_dapi, batches, dapi_filter_by_brenner,
opera18days_line_colors, opera18days_panels, reps = opera18days_reps,
figsize=(10,6))
Percentage out of the sites that passed the previous filter. In parenthesis are absolute values.
A site will be filtered out if after tiling, no tile is containing at least one whole cell that Cellpose detected.
dapi_filter_by_tiling=show_site_survival_dapi_tiling(df_dapi, batches, dapi_filter_by_cellpose,
opera18days_line_colors, opera18days_panels, figsize=(10,6),
reps = opera18days_reps)
Percentage out of the sites that passed the previous filter. In parenthesis are absolute values (if different than the percentages).
show_site_survival_target_brenner(df_dapi, df_target, dapi_filter_by_tiling,
figsize=(10,10), markers=opera18days_markers)
names = ['Total number of tiles', 'Total number of whole cells']
stats = ['n_valid_tiles','site_whole_cells_counts_sum','site_cell_count','site_cell_count_sum']
total_sum = calc_total_sums(df_target, df_dapi, stats, opera18days_markers)
## Are we using FMRP?
markers_for_d18 = markers.copy()
markers_for_d18.remove('TIA1')
total_sum[total_sum.marker.isin(markers_for_d18)].n_valid_tiles.sum()
318137
total_sum[total_sum.marker =='DAPI'].site_whole_cells_counts_sum.sum()
78482.0
total_sum[total_sum.marker =='DAPI'].site_cell_count.sum()
282929.0
show_total_sum_tables(total_sum)
| n_valid_tiles | % valid tiles | site_whole_cells_counts_sum | site_cell_count | |
|---|---|---|---|---|
| batch1 | ||||
| count | 800.000000 | 800.000000 | 800.000000 | 800.000000 |
| mean | 238.811250 | 2.388112 | 166.008750 | 580.743750 |
| std | 139.461294 | 1.394613 | 152.195517 | 392.834608 |
| min | 1.000000 | 0.010000 | 1.000000 | 2.000000 |
| 25% | 132.000000 | 1.320000 | 93.000000 | 341.000000 |
| 50% | 241.000000 | 2.410000 | 149.000000 | 563.000000 |
| 75% | 332.250000 | 3.322500 | 208.000000 | 779.250000 |
| max | 583.000000 | 5.830000 | 2122.000000 | 4721.000000 |
| sum | 191049.000000 | NaN | 132807.000000 | 464595.000000 |
| expected_count | 450.000000 | 450.000000 | 450.000000 | 450.000000 |
| n_valid_tiles | % valid tiles | site_whole_cells_counts_sum | site_cell_count | |
|---|---|---|---|---|
| batch2 | ||||
| count | 792.000000 | 792.000000 | 792.000000 | 792.000000 |
| mean | 225.335859 | 2.253359 | 133.366162 | 509.843434 |
| std | 133.733052 | 1.337331 | 82.621522 | 298.209843 |
| min | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
| 25% | 122.750000 | 1.227500 | 72.750000 | 283.000000 |
| 50% | 203.000000 | 2.030000 | 118.000000 | 465.000000 |
| 75% | 323.250000 | 3.232500 | 185.250000 | 724.000000 |
| max | 662.000000 | 6.620000 | 648.000000 | 1917.000000 |
| sum | 178466.000000 | NaN | 105626.000000 | 403796.000000 |
| expected_count | 450.000000 | 450.000000 | 450.000000 | 450.000000 |
| n valid tiles | % valid tiles | site_whole_cells_counts_sum | site_cell_count | |
|---|---|---|---|---|
| All batches | ||||
| count | 1592.000000 | 1592.000000 | 1592.000000 | 1592.000000 |
| mean | 232.107412 | 2.321074 | 149.769472 | 545.471734 |
| std | 136.764821 | 1.367648 | 123.665234 | 350.670068 |
| min | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
| 25% | 124.000000 | 1.240000 | 80.000000 | 307.750000 |
| 50% | 226.000000 | 2.260000 | 134.000000 | 517.500000 |
| 75% | 330.000000 | 3.300000 | 200.000000 | 758.000000 |
| max | 662.000000 | 6.620000 | 2122.000000 | 4721.000000 |
| sum | 369515.000000 | NaN | 238433.000000 | 868391.000000 |
| expected_count | 450.000000 | 450.000000 | 450.000000 | 450.000000 |
For each batch, cell line, replicate and markerTotal number of tiles
df_no_empty_sites = df_dapi[df_dapi.n_valid_tiles !=0]
plot_cell_count(df_no_empty_sites, opera18days_lines_order, opera18days_custom_palette, y='site_cell_count_sum',
title='Cell Count Average per Site (from tiles)')
plot_cell_count(df_no_empty_sites, opera18days_lines_order, opera18days_custom_palette, y='site_whole_cells_counts_sum',
title='Whole Cell Count Average per Site')
plot_cell_count(df_no_empty_sites, opera18days_lines_order, opera18days_custom_palette, y='site_cell_count',
title='Cellpose Cell Count Average per Site')
df_no_empty_sites = df_dapi[df_dapi.n_valid_tiles !=0]
plot_cell_count(df_no_empty_sites, opera18days_lines_order, opera18days_custom_palette, y='site_cell_count_sum',
title='Cell Count Average per Site (from tiles)')
plot_cell_count(df_no_empty_sites, opera18days_lines_order, opera18days_custom_palette, y='site_whole_cells_counts_sum',
title='Whole Cell Count Average per Site')
plot_cell_count(df_no_empty_sites, opera18days_lines_order, opera18days_custom_palette, y='site_cell_count',
title='Cellpose Cell Count Average per Site')
df_dapi.groupby(['cell_line_cond']).n_valid_tiles.mean()
cell_line_cond FUSHeterozygous Untreated 2.924984 FUSHomozygous Untreated 3.255590 FUSHomozygous stress 2.503102 FUSRevertant Untreated 1.609745 OPTN Untreated 2.483955 SNCA Untreated 1.326592 TBK1 Untreated 1.895484 TDP43 Untreated 2.161961 WT Untreated 3.902980 WT stress 2.918544 Name: n_valid_tiles, dtype: float64
df_dapi[['site_cell_count']].mean()
site_cell_count 6.211258 dtype: float64
plot_catplot(df_dapi, opera18days_custom_palette, opera18days_reps,
x='n_valid_tiles', x_title='valid tiles count', batch_min=1, batch_max=2)
/home/projects/hornsteinlab/Collaboration/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:1061: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df.loc[:, 'batch_rep'] = df['batch'] + " " + df['rep']
plot_hm_of_mean_cell_count_per_tile(df_dapi, split_by='rep', rows='cell_line', columns='panel')
for batch in batches:
print(batch)
run_calc_hist_new(f'{batch}',opera18days_cell_lines_for_disp, opera18days_markers,
root_directory_raw, root_directory_proc, hist_sample=10,
sample_size_per_markers=200, ncols=7, nrows=5)
print("="*30)
# save notebook as HTML ( the HTML will be saved in the same folder the original script is)
from IPython.display import display, Javascript
display(Javascript('IPython.notebook.save_checkpoint();'))
os.system(f'jupyter nbconvert --to html tools/preprocessing_tools/qc_reports/qc_report_d18_Opera_80pct.ipynb --output {NOVA_HOME}/manuscript/preprocessing_qc_reports/qc_report_d18_Opera_80pct.html')